#ifndef _RDS_H
#define _RDS_H 

#include <net/sock.h>
#include <asm/scatterlist.h>

/*
 * Port number used by RDS.
 *
 * XXX Chosen arbitrarily, but it at least seems to be unused:
 * #               18464-18768 Unassigned
 * We should do better.  We want a reserved port to discourage unprivileged
 * userspace from listening on it.
 *
 * Port 18633 was used by the version that had ack frames on the wire.
 */
#define RDS_PORT	18634


/*
 * Fallback definitions of the RDS address family, protocol family and
 * socket-option level.  Each one is only defined here when nothing included
 * earlier has already defined it.
 */
#ifndef AF_RDS
#define AF_RDS          28      /* Reliable Datagram Socket     */
#endif

#ifndef PF_RDS
#define PF_RDS          AF_RDS
#endif

#ifndef SOL_RDS
#define SOL_RDS         272
#endif

/*
 * Kernel feature switches, unconditionally enabled here.
 * KERNEL_HAS_PROTO_REGISTER selects the layout of struct rds_sock and the
 * rds_sk_to_rs()/rds_rs_to_sk() variants later in this header; the other two
 * are not tested anywhere in this header.
 */
#define KERNEL_HAS_PROTO_REGISTER 1
#define KERNEL_HAS_INET_SK_RETURNING_INET_SOCK 1
#define KERNEL_HAS_CORE_CALLING_DEV_IOCTL 1

/* x86-64 doesn't include kmap_types.h from anywhere */
#include <asm/kmap_types.h>
#include <linux/highmem.h>

#include "info.h"

/*
 * Sysctl identifiers for RDS.
 * XXX These numbers are not reserved, so we need to worry about them
 * conflicting with other users too.
 */
#define SYSCTL_NET_RDS 9912
#define SYSCTL_NET_RDS_IB 100

/*
 * rdsdebug() - printf-style debug output.
 *
 * With DEBUG defined this forwards to pr_debug(), prefixing the message with
 * the name of the calling function.  Without DEBUG it is an empty inline
 * function rather than an empty macro: compiling the call away via
 * pr_debug() caused unused variable warnings, whereas a real (empty)
 * function still consumes its arguments and keeps printf format checking.
 *
 * @fmt: printf-style format string; never written through, hence const.
 */
#ifdef DEBUG
#define rdsdebug(fmt, args...) pr_debug("%s(): " fmt, __func__ , ##args)
#else
static inline void __attribute__ ((format (printf, 1, 2)))
rdsdebug(const char *fmt, ...)
{
}
#endif

/*
 * Home-grown per-cpu storage.
 *
 * Some kernels have a bug in the per_cpu() api which makes DEFINE_PER_CPU
 * trigger an oops on insmod because the per-cpu section in the module is not
 * cacheline-aligned.  Telling users of older kernels to upgrade is not
 * reasonable, so we roll our own until this no longer has to build against
 * those kernels.
 *
 * The replacement is a plain array with NR_CPUS elements:
 *   RDS_DEFINE_PER_CPU(type, var)   defines the array,
 *   RDS_DECLARE_PER_CPU(type, var)  declares it extern,
 *   rds_per_cpu(var, cpu)           selects the element for @cpu.
 */
#define RDS_DEFINE_PER_CPU(type, var)  type var[NR_CPUS] 
#define RDS_DECLARE_PER_CPU(type, var)  extern type var[NR_CPUS] 
#define rds_per_cpu(var, cpu)  var[cpu] 

/*
 * ceil(x, y) - divide x by y, rounding the quotient up.
 *
 * Both arguments are converted to unsigned long and evaluated exactly once.
 * y must be non-zero.  The result is built from the quotient and the
 * remainder instead of (x + y - 1) / y so that it does not wrap around when
 * x is close to ULONG_MAX.
 *
 * NOTE(review): kernels that provide DIV_ROUND_UP() offer a similar helper;
 * consider switching once older kernels no longer need to be supported.
 */
#define ceil(x, y) \
	({ unsigned long __ceil_x = (x), __ceil_y = (y); \
	   __ceil_x / __ceil_y + (__ceil_x % __ceil_y != 0); })

/* NOTE(review): presumably a socket option number at the SOL_RDS level -- confirm. */
#define RDS_CANCEL_SENT_TO	1
/* RDS_INFO_* starts at 10000 in info.h */

/* Fragment size is 1 << RDS_FRAG_SHIFT, i.e. 4096. */
#define RDS_FRAG_SHIFT	12
#define RDS_FRAG_SIZE	((unsigned int)(1 << RDS_FRAG_SHIFT))

/*
 * Congestion map geometry.  The map holds 65536 bits (8192 bytes) --
 * presumably one bit per possible 16-bit port value; confirm in cong.c.
 * The remaining macros express that size in unsigned longs and in whole
 * pages, and give the number of bits that fit in one page.
 */
#define RDS_CONG_MAP_BYTES	(65536 / 8)
#define RDS_CONG_MAP_LONGS	(RDS_CONG_MAP_BYTES / sizeof(unsigned long))
#define RDS_CONG_MAP_PAGES	(PAGE_ALIGN(RDS_CONG_MAP_BYTES) / PAGE_SIZE)
#define RDS_CONG_MAP_PAGE_BITS	(PAGE_SIZE * 8)

/*
 * A congestion bitmap.
 *
 * m_page_addrs has RDS_CONG_MAP_PAGES entries, one for each page needed to
 * hold RDS_CONG_MAP_BYTES of bitmap.  NOTE(review): the entries are stored
 * as unsigned long and are presumably page addresses -- confirm in cong.c.
 */
struct rds_cong_map {
	struct rb_node		m_rb_node;
	/* big-endian address; presumably the key the map is looked up by */
	__be32			m_addr;
	wait_queue_head_t	m_waitq;
	struct list_head	m_conn_list;
	unsigned long		m_page_addrs[RDS_CONG_MAP_PAGES];
};


/*
 * Connection state flags.
 *
 * These are used to stop the rds thread from calling into send or receive
 * while the connection is still connecting.  This stems from not being
 * able to cancel queued send and recv work in reconnect.
 *
 * NOTE(review): presumably these are kept in rds_connection.c_status --
 * confirm against the users.
 */
#define RDS_CONN_CONNECTING	0
#define RDS_CONN_CONNECTED	1

/*
 * Per-connection state.  rds_conn_create() builds a connection from an
 * laddr/faddr pair and a transport.
 *
 * NOTE(review): field notes marked "presumably" are inferred from names and
 * from the rds_transport documentation in this header; confirm them against
 * the .c files before relying on them.
 */
struct rds_connection {
	struct hlist_node	c_hash_node;
	/* presumably the local and foreign (peer) addresses */
	__be32			c_laddr;
	__be32			c_faddr;
	/* serializes the lists that m_sock_item and m_conn_item are on */
	spinlock_t		c_lock;

	/* presumably the congestion maps for c_laddr and c_faddr */
	struct rds_cong_map	*c_lcong;
	struct rds_cong_map	*c_fcong;

	/* callers of the transport's .xmit serialize on this semaphore */
	struct semaphore	c_send_sem;
	/*
	 * presumably the message currently being transmitted and how far
	 * into it the send has progressed (compare the hdr_off, sg and off
	 * arguments of rds_transport.xmit)
	 */
	struct rds_message	*c_xmit_rm;
	unsigned long		c_xmit_sg;
	unsigned int		c_xmit_hdr_off;
	unsigned int		c_xmit_data_off;

	u64			c_next_tx_seq;
	struct list_head	c_send_queue;
	struct list_head	c_retrans;

	u64			c_next_rx_seq;

	/* transport in use and an opaque pointer for its private data */
	struct rds_transport	*c_trans;
	void			*c_transport_data;

	/* presumably holds the RDS_CONN_* state */
	unsigned long		c_status;
	unsigned long		c_reconnect_jiffies;
	/*
	 * work items; presumably run by rds_send_worker(), rds_recv_worker(),
	 * rds_connect_worker() and rds_shutdown_worker() respectively
	 */
	struct delayed_work	c_send_w;
	struct delayed_work	c_recv_w;
	struct delayed_work	c_conn_w;
	struct work_struct	c_down_w;

	/* presumably bookkeeping for sending congestion map updates */
	struct list_head	c_map_item;
	unsigned long		c_map_queued;
	unsigned long		c_map_offset;
	unsigned long		c_map_bytes;
};

/* NOTE(review): presumably a value for rds_header.h_flags -- confirm. */
#define RDS_FLAG_CONG_BITMAP 1

/*
 * The RDS wire header.  Multi-byte fields are big-endian; h_padding rounds
 * the fields up to 24 bytes.
 */
struct rds_header {
	__be64	h_sequence;
	__be32	h_len;
	__be16	h_sport;
	__be16	h_dport;
	u8	h_flags;
	u8	h_padding[7];
};

/*
 * A received message.  struct rds_message embeds one of these as m_inc so
 * that loopback can pass a sent message straight back up the rx path.
 */
struct rds_incoming {
	/* presumably adjusted by rds_inc_addref() and rds_inc_put() */
	atomic_t		i_refcount;
	struct list_head	i_item;
	struct rds_connection	*i_conn;
	/* wire header of the message */
	struct rds_header	i_hdr;
	unsigned long		i_rx_jiffies;
	__be32			i_saddr;
};

/*
 * m_sock_item and m_conn_item are on lists that are serialized under
 * conn->c_lock.  m_sock_item has additional meaning in that once it is empty
 * the message will not be put back on the retransmit list after being sent.
 * messages that are canceled while being sent rely on this.
 *
 * m_inc is used by loopback so that it can pass an incoming message straight
 * back up into the rx path.  It embeds a wire header which is also used by
 * the send path, which is kind of awkward.
 *
 * m_sock_item indicates the message's presence on a socket's send or receive
 * queue.  m_rs will point to that socket.
 *
 * m_daddr is used by cancellation to prune messages to a given destination.
 *
 * The RDS_MSG_ON_SOCK and RDS_MSG_ON_CONN flags are used to avoid lock
 * nesting.  As paths iterate over messages on a sock, or conn, they must
 * also lock the conn, or sock, to remove the message from those lists too.
 * Testing the flag to determine if the message is still on the lists lets
 * us avoid testing the list_head directly.  That means each path can use
 * the message's list_head to keep it on a local list while juggling locks
 * without confusing the other path.
 *
 * m_ack_seq is an optional field set by transports who need a different
 * sequence number range to invalidate.  They can use this in a callback
 * that they pass to rds_send_drop_acked() to see if each message has been
 * acked.  The HAS_ACK_SEQ flag can be used to detect messages which haven't
 * had ack_seq set yet.
 */
#define RDS_MSG_ON_SOCK 1	/* message is still on a sock's list */
#define RDS_MSG_ON_CONN 2	/* message is still on a conn's list */
#define RDS_MSG_HAS_ACK_SEQ 3	/* m_ack_seq has been set */

/*
 * An outgoing message: fixed bookkeeping followed by a variable number of
 * scatterlist entries.  The trailing array is a C99 flexible array member,
 * so sizeof(struct rds_message) covers only the fixed part and the entries
 * must be accounted for when the message is allocated.
 */
struct rds_message {
	atomic_t		m_refcount;
	/* presence on a socket's send or receive queue; see m_rs */
	struct list_head	m_sock_item;
	struct list_head	m_conn_item;
	/* lets loopback hand the message straight to the rx path */
	struct rds_incoming	m_inc;
	/* optional, set by transports; valid once RDS_MSG_HAS_ACK_SEQ is set */
	u64			m_ack_seq;
	/* used by cancellation to prune messages to a given destination */
	__be32			m_daddr;
	/* RDS_MSG_* flags */
	unsigned long		m_flags;
	/* socket whose queue m_sock_item is on */
	struct rds_sock		*m_rs;
	/* NOTE(review): presumably describe how many m_sg entries exist/are used */
	unsigned int		m_nents;
	unsigned int		m_count;
	struct scatterlist	m_sg[];
};

/**
 * struct rds_transport -  transport specific behavioural hooks
 *
 * @xmit: .xmit is called by rds_send_xmit() to tell the transport to send
 *        part of a message.  The caller serializes on the send_sem so this
 *        doesn't need to be reentrant for a given conn.  The header must be
 *        sent before the data payload.  .xmit must be prepared to send a
 *        message with no data payload.  .xmit should return the number of
 *        bytes that were sent down the connection, including header bytes.
 *        Returning 0 tells the caller that it doesn't need to perform any
 *        additional work now.  This is usually the case when the transport has
 *        filled the sending queue for its connection and will handle
 *        triggering the rds thread to continue the send when space becomes
 *        available.  Returning -EAGAIN tells the caller to retry the send
 *        immediately.  Returning -ENOMEM tells the caller to retry the send at
 *        some point in the future.
 *
 * @conn_shutdown: conn_shutdown stops traffic on the given connection.  Once
 *                 it returns the connection can not call rds_recv_incoming().
 *                 This will only be called once after conn_connect returns
 *                 non-zero success.  The caller serializes this with
 *                 the send and connecting paths (xmit_* and conn_*).  The
 *                 transport is responsible for other serialization, including
 *                 rds_recv_incoming().  This is called in process context but
 *                 should try hard not to block.
 *
 * @xmit_cong_map: This asks the transport to send the local bitmap down the
 * 		   given connection.  XXX get a better story about the bitmap
 * 		   flag and header.
 */

/*
 * Table of operations supplied by a transport and passed to
 * rds_trans_register() / rds_trans_unregister().  The contracts of xmit,
 * conn_shutdown and xmit_cong_map are described in the kernel-doc comment
 * preceding this structure.
 */
struct rds_transport {
	struct list_head	t_item;
	struct module		*t_owner;
	char			*t_name;
	int (*laddr_check)(__be32 addr);

	/* connection setup and teardown */
	int (*conn_alloc)(struct rds_connection *conn, gfp_t gfp);
	void (*conn_free)(void *data);
	int (*conn_connect)(struct rds_connection *conn);
	void (*conn_shutdown)(struct rds_connection *conn);

	/* send path */
	void (*xmit_prepare)(struct rds_connection *conn);
	void (*xmit_complete)(struct rds_connection *conn);
	int (*xmit)(struct rds_connection *conn, struct rds_message *rm,
		    unsigned int hdr_off, unsigned int sg, unsigned int off);
	int (*xmit_cong_map)(struct rds_connection *conn,
			     struct rds_cong_map *map, unsigned long offset);

	/* receive path and handling of incoming messages */
	int (*recv)(struct rds_connection *conn);
	int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov,
				size_t size);
	void (*inc_purge)(struct rds_incoming *inc);
	void (*inc_free)(struct rds_incoming *inc);

	void (*listen_stop)(void);
	unsigned int (*stats_info_copy)(struct rds_info_iterator *iter,
					unsigned int avail);
	void (*exit)(void);
};

/*
 * Per-socket RDS state.  Depending on KERNEL_HAS_PROTO_REGISTER the generic
 * struct sock is either embedded as the first member or referenced through
 * a pointer; use rds_sk_to_rs() and rds_rs_to_sk() to convert between the
 * two instead of touching rs_sk directly.
 */
struct rds_sock {
#ifdef KERNEL_HAS_PROTO_REGISTER
	struct sock		rs_sk;
#else
	struct sock		*rs_sk;
#endif

	/*
	 * The bound address serves both the incoming and the outgoing
	 * direction; INADDR_ANY is not supported.
	 */
	struct rb_node		rs_bound_node;
	__be32			rs_bound_addr;
	__be32			rs_conn_addr;
	__be16			rs_bound_port;
	__be16			rs_conn_port;

	/*
	 * Only carries the transport from bind to the point where
	 * connections are initiated.  Every other use of the transport goes
	 * through the connection.
	 */
	struct rds_transport    *rs_transport;

	/* rs_lock guards the members that follow, up to the blank line */
	spinlock_t		rs_lock;
	struct list_head	rs_send_queue;
	u32			rs_snd_bytes;
	int			rs_rcv_bytes;

	/*
	 * Guarded by sk->sk_callback_lock, which the rx path takes anyway
	 * to test for races with rds_release.
	 */
	struct list_head	rs_recv_queue;

	/* used solely for stats reporting */
	struct list_head	rs_item;
};

/*
 * Conversions between the generic struct sock and our struct rds_sock.
 * Which pair is compiled depends on how struct rds_sock holds its sock.
 */
#ifdef KERNEL_HAS_PROTO_REGISTER

/* The sock is embedded in the rds_sock, so step back out to the container. */
static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
{
	return container_of(sk, struct rds_sock, rs_sk);
}

/* The sock is embedded in the rds_sock, so hand back its address. */
static inline struct sock *rds_rs_to_sk(struct rds_sock *rs)
{
	return &rs->rs_sk;
}

#else /* !KERNEL_HAS_PROTO_REGISTER */

/* The rds_sock hangs off the sock's sk_protinfo pointer. */
static inline struct rds_sock *rds_sk_to_rs(const struct sock *sk)
{
	return (struct rds_sock *)sk->sk_protinfo;
}

/* The rds_sock only stores a pointer to its sock. */
static inline struct sock *rds_rs_to_sk(const struct rds_sock *rs)
{
	return rs->rs_sk;
}

#endif /* KERNEL_HAS_PROTO_REGISTER */

/*
 * The stack assigns sk_sndbuf and sk_rcvbuf to twice the specified value
 * to account for overhead.  We don't account for overhead, we just apply
 * the number of payload bytes to the specified value.
 */
static inline int rds_sk_sndbuf(struct rds_sock *rs)
{
	return rds_rs_to_sk(rs)->sk_sndbuf / 2;
}
static inline int rds_sk_rcvbuf(struct rds_sock *rs)
{
	return rds_rs_to_sk(rs)->sk_rcvbuf / 2;
}

/*
 * Core RDS counters.  One instance per cpu is declared further down as
 * rds_stats via RDS_DECLARE_PER_CPU(); rds_stats_inc(member) increments a
 * member in the calling cpu's instance.
 */
struct rds_statistics {
	unsigned long	s_conn_reset;
	/* receive path */
	unsigned long	s_recv_drop_old_seq;
	unsigned long	s_recv_drop_no_sock;
	unsigned long	s_recv_drop_dead_sock;
	unsigned long	s_recv_deliver_raced;
	unsigned long	s_recv_delivered;
	unsigned long	s_recv_queued;
	unsigned long	s_recv_immediate_retry;
	unsigned long	s_recv_delayed_retry;
	/* send path */
	unsigned long	s_send_queue_empty;
	unsigned long	s_send_queue_full;
	unsigned long	s_send_sem_contention;
	unsigned long	s_send_sem_queue_raced;
	unsigned long	s_send_immediate_retry;
	unsigned long	s_send_delayed_retry;
	unsigned long	s_send_drop_acked;
	/* page remainder allocation */
	unsigned long	s_page_remainder_hit;
	unsigned long	s_page_remainder_miss;
	/* congestion updates */
	unsigned long	s_cong_update_queued;
	unsigned long	s_cong_update_received;
	unsigned long	s_cong_send_error;
	unsigned long	s_cong_send_blocked;
};

/* af_rds.c */
/* Per-socket helpers; each operates on the rds_sock it is given. */
void rds_sock_addref(struct rds_sock *rs);
void rds_sock_put(struct rds_sock *rs);
void rds_wake_sk_sleep(struct rds_sock *rs);

/* bind.c */
/* Bound sockets are looked up by a big-endian address and port pair. */
int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len);
void rds_remove_bound(struct rds_sock *rs);
struct rds_sock *rds_find_bound(__be32 addr, __be16 port);

/* cong.c */
/*
 * Congestion map handling.  The bit and wait helpers address a map by
 * big-endian port number.
 */
int rds_cong_get_maps(struct rds_connection *conn);
void rds_cong_add_conn(struct rds_connection *conn);
void rds_cong_remove_conn(struct rds_connection *conn);
void rds_cong_set_bit(struct rds_cong_map *map, __be16 port);
void rds_cong_clear_bit(struct rds_cong_map *map, __be16 port);
int rds_cong_wait(struct rds_cong_map *map, __be16 port, int nonblock);
void rds_cong_queue_updates(struct rds_cong_map *map);
void rds_cong_map_updated(struct rds_cong_map *map);
void __exit rds_cong_exit(void);

/* conn.c */
/*
 * Connection setup/teardown for the module plus creation and reset of
 * individual connections.
 */
int __init rds_conn_init(void);
void __exit rds_conn_exit(void);
struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
				       struct rds_transport *trans, gfp_t gfp);
void rds_conn_reset(struct rds_connection *conn);

/* message.c */
/*
 * rds_message_inc_copy_to_user(), rds_message_inc_purge() and
 * rds_message_inc_free() have the same signatures as the inc_copy_to_user,
 * inc_purge and inc_free hooks of struct rds_transport.
 */
struct rds_message *rds_message_copy_from_user(struct iovec *first_iov,
					       size_t total_len);
void rds_message_populate_header(struct rds_message *rm, __be16 sport,
			         __be16 dport, u64 seq);
int rds_message_inc_copy_to_user(struct rds_incoming *inc,
				 struct iovec *first_iov, size_t size);
void rds_message_inc_purge(struct rds_incoming *inc);
void rds_message_inc_free(struct rds_incoming *inc);
void rds_message_addref(struct rds_message *rm);
void rds_message_put(struct rds_message *rm);

/* page.c */
int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
			     gfp_t gfp);
/*
 * rds_page_copy_user() takes the copy direction in @to_user.  The two
 * wrapper macros below fix it: rds_page_copy_to_user() passes 1 and
 * rds_page_copy_from_user() passes 0.
 */
int rds_page_copy_user(struct page *page, unsigned long offset,
		       void __user *ptr, unsigned long bytes,
		       int to_user);
#define rds_page_copy_to_user(page, offset, ptr, bytes) \
	rds_page_copy_user(page, offset, ptr, bytes, 1)
#define rds_page_copy_from_user(page, offset, ptr, bytes) \
	rds_page_copy_user(page, offset, ptr, bytes, 0)
void rds_page_exit(void);

/* recv.c */
/*
 * Receive path.  Transports hand received messages to rds_recv_incoming();
 * once a transport's conn_shutdown hook has returned, the connection can no
 * longer call it.
 */
void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
		  __be32 saddr);
void rds_inc_addref(struct rds_incoming *inc);
void rds_inc_put(struct rds_incoming *inc);
void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
		       struct rds_incoming *inc, gfp_t gfp, enum km_type km);
int rds_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		size_t size, int msg_flags);
void rds_clear_recv_queue(struct rds_sock *rs);
void rds_inc_info_copy(struct rds_incoming *inc,
		       struct rds_info_iterator *iter,
		       __be32 saddr, __be32 daddr, int flip);

/* send.c */
int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
		size_t payload_len);
void rds_send_reset(struct rds_connection *conn);
int rds_send_xmit(struct rds_connection *conn);
/* forward declaration so the prototype below does not need the inet headers */
struct sockaddr_in;
void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest);
/*
 * Callback type that a transport passes to rds_send_drop_acked() to decide
 * whether a given message has been acked.
 * NOTE(review): the callback takes uint64_t while rds_send_drop_acked()
 * takes u64 -- consider using one spelling for both.
 */
typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack);
void rds_send_drop_acked(struct rds_connection *conn, u64 ack,
			 is_acked_func is_acked);

/* stats.c */
/* Per-cpu array of the core counters, one struct rds_statistics per cpu. */
RDS_DECLARE_PER_CPU(struct rds_statistics, rds_stats);
/*
 * rds_stats_inc_which(which, member) increments @member in the element of
 * the per-cpu array @which selected by get_cpu(), then calls put_cpu().
 * rds_stats_inc(member) does the same for the core rds_stats array.
 */
#define rds_stats_inc_which(which, member) do {		\
	rds_per_cpu(which, get_cpu()).member++;		\
	put_cpu();					\
} while (0)
#define rds_stats_inc(member) rds_stats_inc_which(rds_stats, member)
int __init rds_stats_init(void);
void rds_stats_exit(void);
void rds_stats_info_copy(struct rds_info_iterator *iter,
			 unsigned long *values, char **names, size_t nr);

/* sysctl.c */
int __init rds_sysctl_init(void);
void __exit rds_sysctl_exit(void);
/* Tunables defined in sysctl.c: send buffer sizing and reconnect delay bounds. */
extern unsigned long rds_sysctl_sndbuf_min;
extern unsigned long rds_sysctl_sndbuf_default;
extern unsigned long rds_sysctl_sndbuf_max;
extern unsigned long rds_sysctl_reconnect_min_jiffies;
extern unsigned long rds_sysctl_reconnect_max_jiffies;

/* threads.c */
int __init rds_threads_init(void);
void __exit rds_threads_exit(void);
extern struct workqueue_struct *rds_wq;
/*
 * Work handlers.  NOTE(review): presumably these service the c_conn_w,
 * c_down_w, c_send_w and c_recv_w work items of struct rds_connection --
 * confirm in threads.c.
 */
void rds_connect_worker(struct work_struct *);
void rds_shutdown_worker(struct work_struct *);
void rds_send_worker(struct work_struct *);
void rds_recv_worker(struct work_struct *);
void rds_connect_complete(struct rds_connection *conn);

/* transport.c */
/*
 * Transport registry.  rds_trans_stats_info_copy() has the same signature
 * as the stats_info_copy hook of struct rds_transport.
 */
int rds_trans_register(struct rds_transport *trans);
void rds_trans_unregister(struct rds_transport *trans);
struct rds_transport *rds_trans_get_preferred(__be32 addr);
void rds_trans_stop_listening(void);
unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter,
				       unsigned int avail);
int __init rds_trans_init(void);
void __exit rds_trans_exit(void);

#endif
